#------ Packages
library(ggplot2)
library(car)
library(questionr)
library(survey)



#---------- CPS data for weight construction
# Read in CPS data (used further down to build the population targets)
cps <- read.csv("cps_data.csv")

# Read in survey data from Latino Decisions
latdec <- read.csv(file = "survey_data.csv")
length(latdec$respid)


#---- Load in state policy data
sdata <- read.csv(file = "state_data.csv")
length(sdata$State)
table(sdata$state)
names(sdata)

#--- Merge state data into survey data
# Left join on `state`: all.x = TRUE keeps every respondent, including those
# whose state has no row in the policy file (their policy columns become NA).
table(latdec$state)
data <- merge(latdec, sdata, by = "state", all.x = TRUE)
length(data$respid)
# A left join only adds rows when the key is duplicated on the right-hand
# side; flag that instead of relying on eyeballing the count above.
if (nrow(data) != nrow(latdec)) {
  warning(
    "Merging state policy data changed the number of respondents: ",
    nrow(latdec), " -> ", nrow(data),
    call. = FALSE
  )
}
names(data)
table(data$VoterID)
table(data$state)
table(data$State)
# Fill in the `State` value for District of Columbia respondents.
# NOTE(review): presumably the policy file has no DC row, so `State` is NA
# for them after the merge -- confirm against state_data.csv.
data$State[data$state == "District of Columbia"] <- "DC"

#---- Merge in Uggen disenfranchisement rate data
uggen <- read.csv(file = "uggen_data.csv")
length(uggen$PctDisenfranchised)
table(uggen$state)
data <- merge(data, uggen, by = "state", all.x = TRUE)
# Same safeguard as above: the respondent count must be unchanged.
if (nrow(data) != nrow(latdec)) {
  warning(
    "Merging disenfranchisement data changed the number of respondents: ",
    nrow(latdec), " -> ", nrow(data),
    call. = FALSE
  )
}
names(data)
table(data$PctDisenfranchised)
table(data$state, data$PctDisenfranchised)




#------ RECODE ORIGINAL SURVEY DATA TO MATCH CPS CATEGORIES
# Each target column is created explicitly as NA before values are filled in.
# Assigning into a subset of a column that does not exist yet relies on
# `data$name` returning NULL, and `$` partial matching could silently pick up
# a different column whose name merely starts with the same letters.

# Gender: 1 = Male, 2 = Female
summary(data$ccc003)
data$demGender <- rep(NA, length(data$respid))
data$demGender[data$ccc003 == "Male"] <- 1
data$demGender[data$ccc003 == "Female"] <- 2
table(data$ccc003, data$demGender)

## Age: birth year
summary(data$ccc004)
data$year1 <- data$ccc004
table(data$year1)
# Construct age categories from birth year
data$ageDem <- rep(NA, length(data$respid))
data$ageDem[data$year1 > 1989] <- 1                      # {Between 18 and 24}
data$ageDem[data$year1 < 1990 & data$year1 > 1979] <- 2  # {Between 25 and 34}
data$ageDem[data$year1 < 1980 & data$year1 > 1969] <- 3  # {Between 35 and 44}
data$ageDem[data$year1 < 1970 & data$year1 > 1959] <- 4  # {Between 45 and 54}
data$ageDem[data$year1 < 1960 & data$year1 > 1949] <- 5  # {Between 55 and 64}
data$ageDem[data$year1 < 1950 & data$year1 > 1939] <- 6  # {Between 65 and 74}
data$ageDem[data$year1 < 1940] <- 7                      # {75+}
table(data$year1, data$ageDem)

## Education: five categories; "some college" and 2-year degrees share code 3
summary(data$ccc002)
data$edu5 <- rep(NA, length(data$respid))
data$edu5[data$ccc002 == "Did not graduate from high school"] <- 1
data$edu5[data$ccc002 == "High school graduate"] <- 2
data$edu5[data$ccc002 %in% c(
  "Some college, but no degree (yet)",
  "2-year college degree"
)] <- 3
data$edu5[data$ccc002 == "4-year college degree"] <- 4
data$edu5[data$ccc002 == "Post-graduate degree (MA, MBA, MD, JD, PhD, etc.)"] <- 5
table(data$ccc002, data$edu5)

# Income: survey brackets collapsed into five codes
table(data$ccc012)
data$demInc <- rep(NA, length(data$respid))
data$demInc[data$ccc012 %in% c(
  "Less than $10,000",
  "$10,000 - $19,999"
)] <- 1
data$demInc[data$ccc012 %in% c(
  "$20,000 - $29,999",
  "$30,000 - $39,999",
  "$40,000 - $49,999"
)] <- 2
data$demInc[data$ccc012 %in% c(
  "$50,000 - $59,999",
  "$60,000 - $69,999"
)] <- 3
# A little bit off in the match here (CPS is 50-75).
# The 80-99 label has a dash, not a hyphen, in the file.
data$demInc[data$ccc012 %in% c(
  "$70,000 - $79,999",
  "$80,000 – $99,999"
)] <- 4
data$demInc[data$ccc012 %in% c(
  "$100,000 - $119,999",
  "$120,000 - $149,999",
  "$150,000 or more"
)] <- 5
table(data$ccc012, data$demInc)
sum(table(data$demInc))


# ---- Recode Other Variables
# Police stops: combine the stop indicator with the stop-frequency item.
# Respondents with polstop == 0 get 0; each frequency answer k (0 through 5)
# is shifted up by one to k + 1. The frequency codes are written after the
# polstop == 0 code, so they take precedence where both apply.
table(data$polstop)
table(data$polfreq)
data$polstop_freq <- rep(NA, length(data$respid))
data$polstop_freq[data$polstop == 0] <- 0
for (freq_code in c(0, 1, 2, 3, 4, 5)) {
  data$polstop_freq[data$polfreq == freq_code] <- freq_code + 1
}
table(data$polstop_freq)




## CLEAN VARIABLES FOR ANALYSIS:

# Felony disenfranchisement policy as a numeric indicator:
# 1 = disenfranchised after felony, 0 = enfranchised, anything else NA.
fel_policy_spec <-
  "'Disenfranchised After Felony' = 1; 'Enfranchised After Felony' = 0; else= NA"
data$disenfranchise <- recode(data$pol.fel, fel_policy_spec, as.factor = FALSE)
table(data$pol.fel, data$disenfranchise)

# Social ties as a binary variable: codes 0 and 1 are kept, code 2 and
# everything else become NA (dropping the 137 respondents with many ties).
table(data$netcon2)
data$netcon_two <- recode(data$netcon2, "0=0; 1=1; 2=NA; else= NA")

table(data$netcon_two)

# Personal conviction status: codes 0 and 1 collapse to 0, codes 2 and 3 to 1.
# 95 RESPONDENTS HAVE CONVICTIONS
data$con2 <- recode(data$con1, "0 = 0; 1 =0; 2 = 1; 3 = 1 ")
table(data$con2)
table(data$con1)

## Code region, from census region codings.
# The state-to-region mapping lives in one lookup table instead of one
# assignment per state, and `region` is written as a whole column. That avoids
# subset-assigning into a column that does not exist yet (which depends on
# `data$region` being NULL and is exposed to `$` partial matching).

names(data)
table(data$state)

census_regions <- list(
  West = c(
    "Washington", "Oregon", "California", "Montana", "Idaho", "Wyoming",
    "Nevada", "Utah", "Colorado", "Arizona", "New Mexico", "Hawaii", "Alaska"
  ),
  Midwest = c(
    "North Dakota", "South Dakota", "Minnesota", "Wisconsin", "Michigan",
    "Nebraska", "Iowa", "Kansas", "Missouri", "Illinois", "Indiana", "Ohio"
  ),
  South = c(
    "Oklahoma", "Texas", "Arkansas", "Louisiana", "Mississippi", "Alabama",
    "Tennessee", "Kentucky", "West Virginia", "Maryland", "Delaware",
    "District of Columbia", "Virginia", "North Carolina", "South Carolina",
    "Georgia", "Florida"
  ),
  Northeast = c(
    "Pennsylvania", "New York", "New Jersey", "Connecticut", "Rhode Island",
    "Massachusetts", "New Hampshire", "Vermont", "Maine"
  )
)

# Named character vector: names are state names, values are region labels.
state_to_region <- setNames(
  rep(names(census_regions), lengths(census_regions)),
  unlist(census_regions, use.names = FALSE)
)

# as.character() makes the lookup go by state name even if `state` was read
# in as a factor; states not in the table (or NA) get region NA.
data$region <- unname(state_to_region[as.character(data$state)])

table(data$state, data$region)





# -------- SAMPLE 1:
# REMOVE MISSING CASES FOR WEIGHT VARIABLES: INCOME, AGE, EDUCATION, Gender

# Slim out survey rows with missing weight variables:
sum(is.na(data$edu5)) # 0 missing education
sum(is.na(data$demGender)) # No missing on gender
sum(is.na(data$demInc)) # 90 missing

data.slim1 <- data[!is.na(data$demInc), ]
length(data.slim1$respid) ## 1110 RESPONDENTS

sum(is.na(data.slim1$ageDem)) ## no missing on age.

#------ SAMPLE 2:
# ------ REMOVE MISSING CASES FOR WHOLE ANALYSES: participation variables,
# linked fate, ideology, conviction history, connections to felons,
# police stops


# - No missing cases: participation scale, linked fate, ideology, voting
sum(is.na(data.slim1$ideo5))
sum(is.na(data.slim1$cleanfate))
sum(is.na(data.slim1$voter))
# Participation scale: sum of the four participation items
data.slim1$part_scale <- data.slim1$write + data.slim1$pet +
  data.slim1$rally + data.slim1$polcamp
sum(is.na(data.slim1$part_scale))


# -------- Remove MISSING CASES
# Variables: conviction history, connections to felons, police stops
# Each step filters the previous sample on one variable, prints the
# before/after distribution, and confirms no NAs remain on that variable.

sum(is.na(data.slim1$con1)) # 37 missing personal conviction question
data.slim2 <- data.slim1[!is.na(data.slim1$con1), ]
table(data.slim1$con1)
table(data.slim2$con1)
sum(is.na(data.slim2$con1))
length(data.slim2$respid) ## 1073 RESPONDENTS

sum(is.na(data.slim2$netcon2)) # 108 missing social ties question
data.slim3 <- data.slim2[!is.na(data.slim2$netcon2), ]
table(data.slim2$netcon2)
table(data.slim3$netcon2)
sum(is.na(data.slim3$netcon2))
length(data.slim3$respid) # 965 RESPONDENTS


sum(is.na(data.slim3$polstop_freq)) # 6 missing police stops question
data.slim4 <- data.slim3[!is.na(data.slim3$polstop_freq), ]
table(data.slim3$polstop_freq)
table(data.slim4$polstop_freq)
# Check the variable that was just filtered on (this previously re-checked
# netcon2, which says nothing about the police-stops filter).
sum(is.na(data.slim4$polstop_freq))
length(data.slim4$respid) ## 959 RESPONDENTS


# 6 respondents cut for being in DC, which doesn't have data on Voter ID
# and punrate
sum(is.na(data.slim4$VoterID))
sum(is.na(data.slim4$PunRate))

data.slim5 <- data.slim4[!is.na(data.slim4$VoterID), ]

sum(is.na(data.slim5$VoterID))
sum(is.na(data.slim5$PunRate))




## CONSTRUCTING WEIGHT

#------------ CLEAN CPS DATA
# Population weights: PWCMPWGT is the final composite weight
cps$wts_cps <- cps$PWCMPWGT

# Race
# (1 = White, 2 = Black, 3 = American Indian, 4 = Asian, 5 = Other)
cps$demRace <- cps$RECODE2
table(cps$demRace)

# Hispanic
cps$demHisp <- cps$PEHSPNON

# Combined race and Hispanic measure, like in the original data set.
# Non-Hispanic respondents (demHisp == 2) keep their race code 1-5;
# Hispanic respondents (demHisp == 1) of any race are coded 6. The Hispanic
# code is written last so it overrides any race code.
cps$demRaceFull <- rep(NA, length(cps$GESTFIPS))
not_hispanic <- cps$demHisp == 2
for (race_code in c(1, 2, 3, 4, 5)) {
  cps$demRaceFull[cps$demRace == race_code & not_hispanic] <- race_code
}
cps$demRaceFull[cps$demHisp == 1] <- 6
table(cps$demRaceFull)


# Gender
#   1  Male
#   2  Female
cps$demGender <- cps$PESEX
table(cps$demGender)


# Age
# Drop RECODE1 category 1, then shift the remaining categories 2-8 down to
# 1-7 so they line up with the survey's ageDem codes.
cps <- subset(cps, cps$RECODE1 != 1)
table(cps$RECODE1)
cps$ageDem <- recode(cps$RECODE1, "2=1;3=2;4=3;5=4;6=5;7=6;8=7;else=NA")
table(cps$ageDem)
# NOTE(review): this only carries labels over if RECODE1 is a factor; for a
# plain numeric column levels() is NULL -- confirm the column type.
levels(cps$ageDem) <- levels(cps$RECODE1)[2:8]

# Education
# Category 6 is set to missing; categories 1-5 are kept.
cps$edu5 <- recode(cps$RECODE3, "6 = NA")
table(cps$edu5)
# NOTE(review): same caveat as for age -- depends on RECODE3 being a factor.
levels(cps$edu5) <- levels(cps$RECODE3)[1:5]



# Income
#   1  Under 20 thousand    - {1, 2, 3, 4, 5, 6}
#   2  20 to 50 thousand    - {7, 8, 9, 10, 11}
#   3  50 to 75 thousand    - {12, 13}
#   4  75 to 100 thousand   - {14}
#   5  100 thousand or more - {15, 16}
cps$demInc <- cps$RECODE4
table(cps$demInc)




################
#----------- Create weighted proportion tables for target parameters
#----------- SAMPLE 1: SLIMMED OUT SURVEY ON JUST WEIGHT VARIABLES
################


#------------- Black Sample
# Subset the CPS to demRaceFull == 2 (Black, not Hispanic)
table(cps$demRaceFull)
bcps <- subset(cps, cps$demRaceFull == 2)
table(bcps$demRaceFull)

# Define target parameters for weight
demos <- c("demGender", "ageDem", "edu5", "demInc")

# Build one weighted proportion table per parameter, then stack them once.
# (Replaces an rbind-in-a-loop that grew `b` one table at a time; the label
# column is sized from the table itself, so it cannot drift out of step with
# the number of rows.)
weighted_props <- lapply(demos, function(v) {
  counts <- wtd.table(bcps[[v]], weights = bcps$wts_cps) # weighted counts
  props <- data.frame(counts / sum(counts)) # counts into proportions
  # Tag every row with the variable the table comes from
  cbind(Item = rep(v, nrow(props)), props)
})
b <- do.call(rbind, weighted_props)
b


# Clean table
colnames(b) <- c("Item", "Value", "Pop_Prop") # rename columns

cps_b_parameters <- b
cps_b_parameters

# Scale proportions to the size of sample 1 so the targets are counts
cps_b_parameters$Pop_Count <- cps_b_parameters$Pop_Prop * nrow(data.slim1)
cps_b_parameters <- cps_b_parameters[-3] ## takes out proportion
cps_b_parameters


# Create separate tables for each of the parameters.
# Each margin drops the Item column and is renamed to (<variable>, "Freq").
cps_b_parameters
x <- split(cps_b_parameters, cps_b_parameters$Item)
x

gen.dist <- setNames(x$demGender[-1], c("demGender", "Freq"))
gen.dist

age.dist <- setNames(x$ageDem[-1], c("ageDem", "Freq"))
age.dist

edu.dist <- setNames(x$edu5[-1], c("edu5", "Freq"))
edu.dist

inc.dist <- setNames(x$demInc[-1], c("demInc", "Freq"))
inc.dist



# Construct Black sample weight
# Start from a design with no clusters (ids = ~1) on sample 1, then rake it
# to the gender, age, education and income margins built above.
black.unweight <- svydesign(ids = ~1, data = data.slim1)
black.rake <- rake(
  design = black.unweight,
  sample.margins = list(~demGender, ~ageDem, ~edu5, ~demInc),
  population.margins = list(gen.dist, age.dist, edu.dist, inc.dist)
)
summary(weights(black.rake))

# Trim weight to the range [0.5, 6]
black_sam <- trimWeights(black.rake, lower = 0.5, upper = 6, strict = TRUE)
summary(weights(black_sam))
length(data.slim1$respid)

# Save data with new sample weights
data.slim1$wts_black <- weights(black_sam)
summary(data.slim1$wts_black)



write.csv(data.slim1, "Weighted Data for Paper.csv")





## CREATE SECOND WEIGHT FOR APPENDIX




################
#----------- Create weighted proportion tables for target parameters
#----------- SAMPLE 2: ALL VARIABLES FOR ANALYSES data.slim5
################



# Reuse the CPS proportion table `b`, scaled to the size of sample 2
cps_b_parameters2 <- b
cps_b_parameters2
cps_b_parameters2$Pop_Count <- cps_b_parameters2$Pop_Prop * nrow(data.slim5)
cps_b_parameters2 <- cps_b_parameters2[-3] # drop the proportion column
cps_b_parameters2


# Create separate tables for each of the parameters.
# Each margin drops the Item column and is renamed to (<variable>, "Freq").
cps_b_parameters2
x <- split(cps_b_parameters2, cps_b_parameters2$Item)
x

gen.dist <- setNames(x$demGender[-1], c("demGender", "Freq"))
gen.dist

age.dist <- setNames(x$ageDem[-1], c("ageDem", "Freq"))
age.dist

edu.dist <- setNames(x$edu5[-1], c("edu5", "Freq"))
edu.dist

inc.dist <- setNames(x$demInc[-1], c("demInc", "Freq"))
inc.dist



# Construct Black sample weight on sample 2
black.unweight <- svydesign(ids = ~1, data = data.slim5)
black.rake <- rake(
  design = black.unweight,
  sample.margins = list(~demGender, ~ageDem, ~edu5, ~demInc),
  population.margins = list(gen.dist, age.dist, edu.dist, inc.dist)
)
summary(weights(black.rake))

# Trim weight to the range [0.5, 6]
black_sam <- trimWeights(black.rake, lower = 0.5, upper = 6, strict = TRUE)
summary(weights(black_sam))
length(data.slim5$respid)

data.slim5$wts_black <- weights(black_sam)
summary(data.slim5$wts_black)



# Save data with new sample weights


write.csv(data.slim5, "Alternative Weighted Data for Appendix.csv")








